{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load related packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cPickle\n",
    "import numpy as np\n",
    "import keras\n",
    "import tensorflow as tf\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.datasets import mnist\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout\n",
    "from keras.optimizers import RMSprop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Function related to loading data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Pfam_from_pickle_file_encoding(name_list_pickle_filename,model_names_list_filename,num_features=16306):\n",
    "    \"\"\"Build multi-hot encodings from a pickled name list and a pickled model list.\n",
    "\n",
    "    Args:\n",
    "        name_list_pickle_filename: path to a pickle holding, for each sample,\n",
    "            the collection of model names found for it (may be empty).\n",
    "        model_names_list_filename: path to a pickle holding the list of all\n",
    "            model names; a name's position in this list is its column.\n",
    "        num_features: length of every encoding vector. Defaults to 16306, the\n",
    "            value that used to be hard-coded in this function.\n",
    "\n",
    "    Returns:\n",
    "        A list with one 1-D numpy array of length num_features per entry of the\n",
    "        name list: 1 in the column of each listed name, 0 elsewhere.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a name does not appear in the model-name list.\n",
    "    \"\"\"\n",
    "    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.\n",
    "    # Pickle data is binary, so the files are opened in 'rb' rather than text mode.\n",
    "    with open(name_list_pickle_filename, 'rb') as f:\n",
    "        name_list = cPickle.load(f)\n",
    "\n",
    "    with open(model_names_list_filename, 'rb') as f:\n",
    "        model_list = cPickle.load(f)\n",
    "\n",
    "    # Build the name -> column lookup once instead of running a linear\n",
    "    # list.index() scan for every name of every sequence. setdefault keeps the\n",
    "    # first occurrence, matching list.index(). Assumes names are hashable\n",
    "    # (e.g. strings).\n",
    "    column_of = {}\n",
    "    for column, model_name in enumerate(model_list):\n",
    "        column_of.setdefault(model_name, column)\n",
    "\n",
    "    encoding = []\n",
    "    for i, names in enumerate(name_list):\n",
    "        if i % 10000 == 0:\n",
    "            print('Processing %dth sequence.' % i)\n",
    "        single_encoding = np.zeros(num_features)\n",
    "        for single_name in names:\n",
    "            if single_name not in column_of:\n",
    "                # Same exception type list.index() raised for unknown names.\n",
    "                raise ValueError('%r is not in list' % (single_name,))\n",
    "            single_encoding[column_of[single_name]] = 1\n",
    "        encoding.append(single_encoding)\n",
    "    return encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 0th sequence.\n",
      "Processing 10000th sequence.\n",
      "Processing 20000th sequence.\n",
      "Processing 0th sequence.\n",
      "Processing 10000th sequence.\n",
      "Processing 20000th sequence.\n"
     ]
    }
   ],
   "source": [
    "# Encode the enzyme and the non-enzyme sets against the same model-name list.\n",
    "enzyme_feature=Pfam_from_pickle_file_encoding(\n",
    "    'Pfam_name_list_new_data.pickle',\n",
    "    'Pfam_model_names_list.pickle')\n",
    "non_enzyme_feature=Pfam_from_pickle_file_encoding(\n",
    "    'Pfam_name_list_non_enzyme.pickle',\n",
    "    'Pfam_model_names_list.pickle')\n",
    "feature = np.concatenate([enzyme_feature, non_enzyme_feature], axis=0)\n",
    "# Enzymes are labelled 1 and non-enzymes 0. The counts come from the loaded\n",
    "# data instead of a hard-coded number, so labels always line up with the rows\n",
    "# of `feature`.\n",
    "label = np.concatenate([np.ones(len(enzyme_feature)),\n",
    "                        np.zeros(len(non_enzyme_feature))])\n",
    "# One-hot encode the labels into two columns.\n",
    "label = tf.keras.utils.to_categorical(label, num_classes=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define hyper-parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Fraction of all samples held out as the test set.\n",
    "test_ratio = 0.1\n",
    "# Number of classes; sets the width of the final classification layer.\n",
    "number_class = 2\n",
    "# Length of each feature vector; sets the input shape of the network.\n",
    "number_features = 16306\n",
    "# Number of samples per training batch.\n",
    "batch_size = 1024\n",
    "# Number of training epochs.\n",
    "epochs = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split the data into training and testing sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out `test_ratio` of the samples for testing; the fixed random_state\n",
    "# makes the split the same on every run.\n",
    "(x_train, x_test,\n",
    " y_train, y_test) = train_test_split(feature,\n",
    "                                     label,\n",
    "                                     test_size=test_ratio,\n",
    "                                     random_state=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "dense_7 (Dense)              (None, 1024)              16698368  \n",
      "_________________________________________________________________\n",
      "dropout_5 (Dropout)          (None, 1024)              0         \n",
      "_________________________________________________________________\n",
      "dense_8 (Dense)              (None, 1024)              1049600   \n",
      "_________________________________________________________________\n",
      "dropout_6 (Dropout)          (None, 1024)              0         \n",
      "_________________________________________________________________\n",
      "dense_9 (Dense)              (None, 2)                 2050      \n",
      "=================================================================\n",
      "Total params: 17,750,018\n",
      "Trainable params: 17,750,018\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Linear stack: two fully connected ReLU layers of 1024 units, each followed\n",
    "# by dropout at rate 0.3 to reduce overfitting, then a softmax layer with one\n",
    "# unit per class.\n",
    "model = Sequential([\n",
    "    Dense(1024, activation='relu', input_shape=(number_features,)),\n",
    "    Dropout(0.3),\n",
    "    Dense(1024, activation='relu'),\n",
    "    Dropout(0.3),\n",
    "    Dense(number_class, activation='softmax'),\n",
    "])\n",
    "# Print the layer structure and parameter counts.\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define loss, optimizer (update rule), and metrics of monitoring the training process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Categorical cross-entropy is used with the one-hot labels and softmax output.\n",
    "# The optimizer is built from the canonical `Adam` class; the lowercase `adam`\n",
    "# alias is not available in every Keras release.\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer=keras.optimizers.Adam(),\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run the training loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 39902 samples, validate on 4434 samples\n",
      "Epoch 1/5\n",
      "39902/39902 [==============================] - 6s 144us/step - loss: 0.3966 - acc: 0.8318 - val_loss: 0.2212 - val_acc: 0.9283\n",
      "Epoch 2/5\n",
      "39902/39902 [==============================] - 6s 142us/step - loss: 0.1457 - acc: 0.9545 - val_loss: 0.1830 - val_acc: 0.9436\n",
      "Epoch 3/5\n",
      "39902/39902 [==============================] - 6s 140us/step - loss: 0.1083 - acc: 0.9663 - val_loss: 0.1853 - val_acc: 0.9452\n",
      "Epoch 4/5\n",
      "39902/39902 [==============================] - 5s 132us/step - loss: 0.0986 - acc: 0.9670 - val_loss: 0.1906 - val_acc: 0.9447\n",
      "Epoch 5/5\n",
      "39902/39902 [==============================] - 5s 120us/step - loss: 0.0916 - acc: 0.9697 - val_loss: 0.1960 - val_acc: 0.9454\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): validation_data is the held-out test split, so the per-epoch\n",
    "# val_* metrics are measured on the same samples as the final evaluation.\n",
    "history = model.fit(x=x_train,\n",
    "                    y=y_train,\n",
    "                    validation_data=(x_test, y_test),\n",
    "                    epochs=epochs,\n",
    "                    batch_size=batch_size,\n",
    "                    verbose=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluate the trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Test loss:', 0.1959872514036361)\n",
      "('Test accuracy:', 0.9454217409840241)\n"
     ]
    }
   ],
   "source": [
    "# score[0] is the loss and score[1] the accuracy metric set in compile().\n",
    "score = model.evaluate(x_test, y_test, verbose=0)\n",
    "# Each message is formatted into a single string so that a Python 2 kernel\n",
    "# prints plain text instead of a tuple repr; Python 3 prints the same text.\n",
    "print('Test loss: %.4f' % score[0])\n",
    "print('Test accuracy: %.4f' % score[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
